Parse the Who Monica Survey Core Data

Data available at: https://thl.fi/publications/monica/monograph_cd/formats/survey.htm

The file used here is the 20% sample of the survey core data found here: https://www.thl.fi/publications/monica/monograph_cd/data/form04_1.zip

Data dictionary is here: https://www.thl.fi/publications/monica/monograph_cd/formats/form048.htm

library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.1
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
monicaform048 <- read_fwf("form04_3.zip", fwf_cols(form = c(1,2), versn=c(3,3), centre=c(4,5), runit=c(6,7), serial=c(8,13), numsur=c(14,14), samunit=c(15,17), dexam=c(18,25), mbirth=c(26,33), agegrp=c(34,34), sex=c(35,35), marit=c(36,36), edlevel=c(37,37), school=c(38,39), cigs=c(40,40), numcigs=c(41,43), daycigs=c(44,44), evercig=c(45,45), stop=c(46,49), iflyear=c(50,50), maxcigs=c(51.53), cigage=c(54,55), cigarsm=c(56,56), cigar=c(57,59), pipesm=c(60,60), pipe=c(61,63), othersm=c(64,65), hibp=c(66,66), drugs=c(67,67), bprecd=c(68,68), high=c(69,69), chdt=c(70,70), chrx=c(71,71), chrecd=c(72,72), asp=c(73,73), menop=c(74,74), agem=c(75,76), horm=c(77,77), pill=c(78,78), syst1=c(79,81), diast1=c(82,84), rz1=c(85,86), syst2=c(87,89), diast2=c(90,92), rz2=c(93,94), cuff=c(95,95), arm=c(96,97), bpcoder=c(98,99), timebp=c(100,103), rtemp=c(104,105), chol=c(106,108), choldl=c(109,111), dchol=c(112,119), hdl=c(120,122), hdldl=c(123,125), dhdl=c(126,133), scn=c(134,136), cotin=c(137,140), carbmon=c(141,142), height=c(143,145), weight=c(146,149), waist=c(150,153), hip=c(154,157), whcoder=c(158,159), oversion=c(160,160), eage=c(161,162), eageg=c(163,163), cohort1=c(164,164), cohort2=c(165,165), edtert1=c(166,166), edtert2=c(167,167), systm=c(168,171), diastm=c(172,175), chola=c(176,179), hdla=c(180,183)))
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   form = col_character(),
##   runit = col_character(),
##   serial = col_character(),
##   dexam = col_character(),
##   school = col_character(),
##   numcigs = col_character(),
##   cigage = col_character(),
##   cigar = col_character(),
##   pipe = col_character(),
##   othersm = col_character(),
##   syst1 = col_character(),
##   diast1 = col_character(),
##   rz1 = col_character(),
##   syst2 = col_character(),
##   diast2 = col_character(),
##   rz2 = col_character(),
##   cuff = col_character(),
##   arm = col_character(),
##   bpcoder = col_character(),
##   timebp = col_character()
##   # ... with 14 more columns
## )
## See spec(...) for full column specifications.
monicaform048
## # A tibble: 11,833 x 75
##    form  versn centre runit serial numsur samunit dexam mbirth agegrp   sex
##    <chr> <dbl>  <dbl> <chr> <chr>   <dbl>   <dbl> <chr>  <dbl>  <dbl> <dbl>
##  1 04        8     10 02    071295      3     888 1805… 9.91e7      1     1
##  2 04        8     10 02    071302      3     888 2605… 9.91e7      1     1
##  3 04        8     10 02    071310      3     888 2005… 9.91e7      1     1
##  4 04        8     10 02    071320      3     888 2607… 9.91e7      1     1
##  5 04        8     10 02    071328      3     888 2107… 9.90e7      1     1
##  6 04        8     10 02    071334      3     888 1208… 9.91e7      1     1
##  7 04        8     10 02    071344      3     888 2906… 9.91e7      1     1
##  8 04        8     10 02    071351      3     888 0308… 9.91e7      1     1
##  9 04        8     10 02    071358      3     888 0111… 9.91e7      1     1
## 10 04        8     10 02    071366      3     888 2507… 9.90e7      1     1
## # … with 11,823 more rows, and 64 more variables: marit <dbl>,
## #   edlevel <dbl>, school <chr>, cigs <dbl>, numcigs <chr>, daycigs <dbl>,
## #   evercig <dbl>, stop <dbl>, iflyear <dbl>, maxcigs <dbl>, cigage <chr>,
## #   cigarsm <dbl>, cigar <chr>, pipesm <dbl>, pipe <chr>, othersm <chr>,
## #   hibp <dbl>, drugs <dbl>, bprecd <dbl>, high <dbl>, chdt <dbl>,
## #   chrx <dbl>, chrecd <dbl>, asp <dbl>, menop <dbl>, agem <dbl>,
## #   horm <dbl>, pill <dbl>, syst1 <chr>, diast1 <chr>, rz1 <chr>,
## #   syst2 <chr>, diast2 <chr>, rz2 <chr>, cuff <chr>, arm <chr>,
## #   bpcoder <chr>, timebp <chr>, rtemp <dbl>, chol <chr>, choldl <dbl>,
## #   dchol <chr>, hdl <chr>, hdldl <chr>, dhdl <chr>, scn <chr>,
## #   cotin <dbl>, carbmon <dbl>, height <dbl>, weight <chr>, waist <chr>,
## #   hip <chr>, whcoder <chr>, oversion <dbl>, eage <dbl>, eageg <dbl>,
## #   cohort1 <dbl>, cohort2 <dbl>, edtert1 <dbl>, edtert2 <dbl>,
## #   systm <chr>, diastm <chr>, chola <chr>, hdla <chr>

Parse dates

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
mf = monicaform048 %>% mutate(
    dexam=parse_date_time(gsub("(99)+", "", dexam), orders=c("dmy", "my", "y")),
    mbirth=parse_date_time(gsub("99", "", substr(mbirth, 3, nchar(mbirth))), orders=c("my", "y")),
    high = as.integer(high), # ever told by doc or provider that you have high cholesterol 1=yes, 2=no, 9=insufficient data
    chdt = as.integer(chdt), # on a special diet for high chol? 1 = yes, 2=no, 3=uncertain, 8=never told they had high chol, 9=insufficient data
    chrx = as.integer(chrx), # taking meds (in last two weeks) 1=yes, 2=no, 3=uncertain, 8=never told had high chol, 9=insufficient
    dchol=dmy(dchol),
    chol=as.integer(chol)/10, # Total serum cholesterol (mmol/l)
    choldl=as.integer(choldl), # Total serum cholesterol (mg/dl)
    hdl=as.integer(hdl)/100.,
    hdldl=as.integer(hdldl),
    height=as.integer(height)/100,
    weight=as.integer(weight)*100, # recorded in 100 g units
    eage=as.integer(eage), # Age on date of examination
    eageg=as.integer(eageg)) # Age group on date of exam, but see data dict above, because coding varies by data collection center
## Warning: 3468 failed to parse.
mf <- mf %>% select(sex, eage, high, chdt, chrx, chol, choldl, weight, height)
mf
## # A tibble: 11,833 x 9
##      sex  eage  high  chdt  chrx  chol choldl weight height
##    <dbl> <int> <int> <int> <int> <dbl>  <int>  <dbl>  <dbl>
##  1     1    33     2     8     8  88.8    888  68800   1.7 
##  2     1    32     2     8     8   4.7    888  88800   1.78
##  3     1    31     2     8     8   4.6    888  91000   1.63
##  4     1    32     2     8     8   4.7    888  68800   1.69
##  5     1    30     2     8     8   4.1    888  61200   1.74
##  6     1    30     2     8     8   5.3    888  68600   1.78
##  7     1    31     2     8     8   6.8    888  77600   1.86
##  8     1    33     2     8     8  99.9    888 999900   9.99
##  9     1    32     2     8     8   6.2    888  59600   1.73
## 10     1    30     2     8     8   4.5    888  66600   1.83
## # … with 11,823 more rows
mf <- mf %>% 
    filter(chol < 99.9, eage < 99, high < 9, chdt != 3, chdt != 9,
           chrx != 3, chrx != 9,
           choldl < 999, (chol < 88.8 | choldl < 888), 
            weight < 999900, height < 9.99) %>% 
    mutate(weight=weight/1000) %>% 
    mutate(bmi=weight/height^2) %>% 
    mutate(chol=ifelse(chol < 88.8, chol, choldl*0.02586)) %>% 
    mutate(hychol=ifelse(chol>=6.5, 1, 0), bmigrp=case_when(bmi<25 ~ 1, bmi<30~2, TRUE ~3)) %>%
    mutate(chdt=ifelse(chdt==8, 2, 1), chrx=ifelse(chrx==8, 2, 1)) %>% # If either indicates high was no, assume they are not on diet or meds for high cholesterol
    select(-choldl)
mf
## # A tibble: 9,006 x 11
##      sex  eage  high  chdt  chrx  chol weight height   bmi hychol bmigrp
##    <dbl> <int> <int> <dbl> <dbl> <dbl>  <dbl>  <dbl> <dbl>  <dbl>  <dbl>
##  1     1    32     2     2     2   4.7   88.8   1.78  28.0      0      2
##  2     1    31     2     2     2   4.6   91     1.63  34.3      0      3
##  3     1    32     2     2     2   4.7   68.8   1.69  24.1      0      1
##  4     1    30     2     2     2   4.1   61.2   1.74  20.2      0      1
##  5     1    30     2     2     2   5.3   68.6   1.78  21.7      0      1
##  6     1    31     2     2     2   6.8   77.6   1.86  22.4      1      1
##  7     1    32     2     2     2   6.2   59.6   1.73  19.9      0      1
##  8     1    30     2     2     2   4.5   66.6   1.83  19.9      0      1
##  9     1    25     2     2     2   5.4   82.6   1.8   25.5      0      2
## 10     1    29     2     2     2   3.6   58.4   1.65  21.5      0      1
## # … with 8,996 more rows
write_csv(mf, "monica-chol.csv")
summary(mf)
##       sex             eage            high            chdt      
##  Min.   :1.000   Min.   :25.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:37.00   1st Qu.:2.000   1st Qu.:2.000  
##  Median :2.000   Median :46.00   Median :2.000   Median :2.000  
##  Mean   :1.516   Mean   :45.84   Mean   :1.851   Mean   :1.843  
##  3rd Qu.:2.000   3rd Qu.:55.00   3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :2.000   Max.   :64.00   Max.   :2.000   Max.   :2.000  
##       chrx            chol            weight           height     
##  Min.   :1.000   Min.   : 2.500   Min.   : 33.00   Min.   :1.220  
##  1st Qu.:2.000   1st Qu.: 4.888   1st Qu.: 63.00   1st Qu.:1.610  
##  Median :2.000   Median : 5.600   Median : 72.20   Median :1.670  
##  Mean   :1.844   Mean   : 5.677   Mean   : 73.65   Mean   :1.677  
##  3rd Qu.:2.000   3rd Qu.: 6.400   3rd Qu.: 82.70   3rd Qu.:1.740  
##  Max.   :2.000   Max.   :17.300   Max.   :193.40   Max.   :2.000  
##       bmi            hychol           bmigrp     
##  Min.   :15.07   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:22.95   1st Qu.:0.0000   1st Qu.:1.000  
##  Median :25.47   Median :0.0000   Median :2.000  
##  Mean   :26.14   Mean   :0.2346   Mean   :1.719  
##  3rd Qu.:28.57   3rd Qu.:0.0000   3rd Qu.:2.000  
##  Max.   :64.08   Max.   :1.0000   Max.   :3.000
ggplot(mf, aes(x = eage, y = chol)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x)

ggplot(mf, aes(x = bmi, y = chol)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x)

ggplot(mf, aes(x = eage, y = weight)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x)

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
ggplotly(ggplot(mf, aes(x=bmi, fill=as.factor(sex))) + geom_histogram(binwidth = 1, alpha = 0.3, position="identity"))
plot_histogram <- function(df, feature) {
    plt <- ggplot(df, aes(x=eval(parse(text=feature)))) +
    geom_histogram(aes(y = ..density..), alpha=0.7, fill="#33AADE", color="black") +
    geom_density(alpha=0.3, fill="red") +
    geom_vline(aes(xintercept=mean(eval(parse(text=feature)))), color="black", linetype="dashed", size=1) +
    labs(x=feature, y = "Density")
    print(plt)
}

plot_multi_histogram <- function(df, feature, label_column) {
    plt <- ggplot(df, aes(x=eval(parse(text=feature)), fill=eval(parse(text=label_column)))) +
    geom_histogram(alpha=0.7, position="identity", aes(y = ..density..), color="black") +
    geom_density(alpha=0.7) +
    geom_vline(aes(xintercept=mean(eval(parse(text=feature)))), color="black", linetype="dashed", size=1) +
    labs(x=feature, y = "Density")
    plt + guides(fill=guide_legend(title=label_column))
}

plot_multi_histogram(mf %>% mutate(sex=as.factor(sex)), 'bmi', 'sex')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.